{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "将句中识别的实体与知识库中实体进行匹配，解决实体歧义问题。可利用上下文的文本相似度进行识别。\n",
    "\n",
    "在data/entity_disambiguation目录中，entity_list.csv是50个实体，valid_data.csv是需要消歧的语句。\n",
    "\n",
    "答案提交在submit目录中，命名为entity_disambiguation_submit.csv。格式为：第一列是需要消歧的语句序号，第二列为多个“实体起始位坐标-实体结束位坐标：实体序号”以“|”分隔的字符串。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "import pandas as pd\n",
    "\n",
    "# Load the knowledge-base entities and register every known entity name with\n",
    "# jieba, so the segmenter keeps each name as one token instead of splitting it.\n",
    "entity_data = pd.read_csv('../data/entity_disambiguation/entity_list.csv', encoding = 'utf-8')\n",
    "for names in entity_data['entity_name'].dropna():\n",
    "    # entity_name values are treated as '|'-separated name lists elsewhere in\n",
    "    # this notebook, so each fragment is registered on its own.\n",
    "    for name in str(names).split('|'):\n",
    "        if name:\n",
    "            jieba.add_word(name)\n",
    "\n",
    "# Load the sentences whose entity mentions need to be disambiguated.\n",
    "valid_data = pd.read_csv('../data/entity_disambiguation/valid_data.csv', encoding = 'gb18030')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the keyword list: names that occur more than once across all\n",
    "# entity_name values. These are the mentions that need disambiguation.\n",
    "import collections\n",
    "\n",
    "name_counts = collections.Counter(\n",
    "    name\n",
    "    for names in entity_data['entity_name'].values.tolist()\n",
    "    for name in names.split('|')\n",
    "    if name  # skip empty fragments produced by a stray or doubled '|'\n",
    ")\n",
    "# Counter keeps first-seen order, so keywords stay in knowledge-base order.\n",
    "keyword_list = [name for name, count in name_counts.items() if count > 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/1y/y3wkm_bd157fxlr0npb5zv9r0000gn/T/jieba.cache\n",
      "Loading model cost 1.146 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "# Build the TF-IDF matrix over the entity descriptions\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Segment each description with jieba and re-join the tokens with spaces so\n",
    "# that TfidfVectorizer can tokenize the text.\n",
    "train_sentence = [' '.join(jieba.cut(desc)) for desc in entity_data['desc'].values]\n",
    "\n",
    "vectorizer = TfidfVectorizer()\n",
    "# X holds one row per description, in the same order as entity_data.\n",
    "X = vectorizer.fit_transform(train_sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "def get_entityid(sentence, id_start=1001):\n",
    "    \"\"\"Return the id of the entity whose description best matches `sentence`.\n",
    "\n",
    "    The text is segmented with jieba, projected into the fitted TF-IDF space\n",
    "    and compared with every row of `X` by cosine similarity.\n",
    "\n",
    "    Args:\n",
    "        sentence: Text surrounding the mention to disambiguate.\n",
    "        id_start: Id assigned to row 0 of `X`; row `k` maps to `id_start + k`.\n",
    "            The default of 1001 reproduces the previously hard-coded value.\n",
    "            NOTE(review): this assumes entity ids are consecutive and follow\n",
    "            the row order of entity_list.csv -- confirm against the data.\n",
    "\n",
    "    Returns:\n",
    "        `id_start` plus the row index of the most similar description.\n",
    "    \"\"\"\n",
    "    segmented = [' '.join(jieba.cut(sentence))]\n",
    "    scores = cosine_similarity(vectorizer.transform(segmented), X)[0]\n",
    "    # The last position of the ascending argsort is the highest-scoring row.\n",
    "    best_row = np.argsort(scores)[-1]\n",
    "    return id_start + best_row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate every keyword mention in each sentence, pick its entity id from the\n",
    "# surrounding text, and write the submission file.\n",
    "\n",
    "CONTEXT_BEFORE = 10  # characters kept before a mention\n",
    "CONTEXT_AFTER = 9    # characters kept after a mention\n",
    "EDGE_WINDOW = 20     # window length used when the mention sits near either end\n",
    "\n",
    "\n",
    "def _context_window(sentence, start, end):\n",
    "    \"\"\"Return the text around sentence[start:end] used for similarity matching.\n",
    "\n",
    "    Every position falls into exactly one branch, so a mention can never\n",
    "    inherit the context window of a previous mention.\n",
    "    \"\"\"\n",
    "    if start < CONTEXT_BEFORE:\n",
    "        return sentence[:EDGE_WINDOW]\n",
    "    if end > len(sentence) - CONTEXT_AFTER:\n",
    "        return sentence[-EDGE_WINDOW:]\n",
    "    return sentence[start - CONTEXT_BEFORE:end + CONTEXT_AFTER]\n",
    "\n",
    "\n",
    "def _find_mentions(sentence, keyword):\n",
    "    \"\"\"Yield the start index of every occurrence of keyword, overlaps included.\"\"\"\n",
    "    start = sentence.find(keyword)\n",
    "    while start != -1:\n",
    "        yield start\n",
    "        # Advance by one character only, so overlapping occurrences are kept.\n",
    "        start = sentence.find(keyword, start + 1)\n",
    "\n",
    "\n",
    "result_data = []\n",
    "for row, sentence in enumerate(valid_data['sentence']):\n",
    "    mentions = []\n",
    "    for keyword in keyword_list:\n",
    "        if not keyword:\n",
    "            # An empty keyword would match at every position.\n",
    "            continue\n",
    "        for start in _find_mentions(sentence, keyword):\n",
    "            end = start + len(keyword)\n",
    "            entity_id = get_entityid(_context_window(sentence, start, end))\n",
    "            mentions.append('{}-{}:{}'.format(start, end, entity_id))\n",
    "    # Always emit exactly two columns: the row number and one '|'-joined string.\n",
    "    result_data.append([row, '|'.join(mentions)])\n",
    "pd.DataFrame(result_data).to_csv('../submit/entity_disambiguation_submit.csv', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3-6:1008|109-112:1008|187-190:1008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>18-21:1008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>23-26:1006|40-43:1006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>7-10:1006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>2-5:1006|14-17:1006</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>28-30:1001|34-36:1001|41-43:1001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>4-6:1001|25-27:1001|34-36:1001|94-96:1001|100-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7</td>\n",
       "      <td>0-2:1001|6-8:1001|19-21:1009|34-36:1001|45-47:...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8</td>\n",
       "      <td>8-10:1001|22-24:1001|34-36:1001|37-39:1001|46-...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>9</td>\n",
       "      <td>14-16:1002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10</td>\n",
       "      <td>0-2:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>11</td>\n",
       "      <td>7-9:1005|20-22:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>12</td>\n",
       "      <td>4-6:1004|29-31:1004|62-64:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>13</td>\n",
       "      <td>26-28:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>14</td>\n",
       "      <td>0-2:1004|24-26:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>15</td>\n",
       "      <td>10-12:1004|28-30:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>16</td>\n",
       "      <td>6-8:1004|20-22:1004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>17</td>\n",
       "      <td>8-10:1011|26-28:1011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>18</td>\n",
       "      <td>9-11:1011|28-30:1013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>19</td>\n",
       "      <td>0-2:1013|18-20:1013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>20</td>\n",
       "      <td>0-2:1013|6-8:1013|12-14:1013|25-27:1001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>21</td>\n",
       "      <td>0-2:1013|26-28:1013|41-43:1011|56-58:1011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>22</td>\n",
       "      <td>0-2:1012|20-22:1012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>23</td>\n",
       "      <td>0-2:1013|18-20:1016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>24</td>\n",
       "      <td>0-2:1012|32-34:1013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>25</td>\n",
       "      <td>0-3:1015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>26</td>\n",
       "      <td>2-5:1015|11-14:1015|18-21:1015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>27</td>\n",
       "      <td>20-23:1015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>28</td>\n",
       "      <td>11-14:1015</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0                                                  1\n",
       "0    0                 3-6:1008|109-112:1008|187-190:1008\n",
       "1    1                                         18-21:1008\n",
       "2    2                              23-26:1006|40-43:1006\n",
       "3    3                                          7-10:1006\n",
       "4    4                                2-5:1006|14-17:1006\n",
       "5    5                   28-30:1001|34-36:1001|41-43:1001\n",
       "6    6  4-6:1001|25-27:1001|34-36:1001|94-96:1001|100-...\n",
       "7    7  0-2:1001|6-8:1001|19-21:1009|34-36:1001|45-47:...\n",
       "8    8  8-10:1001|22-24:1001|34-36:1001|37-39:1001|46-...\n",
       "9    9                                         14-16:1002\n",
       "10  10                                           0-2:1004\n",
       "11  11                                7-9:1005|20-22:1004\n",
       "12  12                     4-6:1004|29-31:1004|62-64:1004\n",
       "13  13                                         26-28:1004\n",
       "14  14                                0-2:1004|24-26:1004\n",
       "15  15                              10-12:1004|28-30:1004\n",
       "16  16                                6-8:1004|20-22:1004\n",
       "17  17                               8-10:1011|26-28:1011\n",
       "18  18                               9-11:1011|28-30:1013\n",
       "19  19                                0-2:1013|18-20:1013\n",
       "20  20            0-2:1013|6-8:1013|12-14:1013|25-27:1001\n",
       "21  21          0-2:1013|26-28:1013|41-43:1011|56-58:1011\n",
       "22  22                                0-2:1012|20-22:1012\n",
       "23  23                                0-2:1013|18-20:1016\n",
       "24  24                                0-2:1012|32-34:1013\n",
       "25  25                                           0-3:1015\n",
       "26  26                     2-5:1015|11-14:1015|18-21:1015\n",
       "27  27                                         20-23:1015\n",
       "28  28                                         11-14:1015"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('../submit/entity_disambiguation_submit.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "basic",
   "language": "python",
   "name": "basic"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
